In [ ]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
x = pd.read_csv('job_descriptions.csv')
# Extract job descriptions
job_descriptions = x['Job Description']
all_descriptions = ' '.join(job_descriptions)
# Tokenize the text into words
words = word_tokenize(all_descriptions)
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]
# Calculate frequency distribution
fdist = FreqDist(filtered_words)
# Plot the most common words
plt.figure(figsize=(10, 6))
fdist.plot(30, cumulative=False)
plt.title('Top 30 Most Common Words in Job Descriptions')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.show()
In [2]:
x.describe()
Out[2]:
| Job Id | latitude | longitude | Company Size | |
|---|---|---|---|---|
| count | 1.615940e+06 | 1.615940e+06 | 1.615940e+06 | 1.615940e+06 |
| mean | 1.548935e+15 | 1.937743e+01 | 1.639926e+01 | 7.370467e+04 |
| std | 8.946722e+14 | 2.355690e+01 | 7.066762e+01 | 3.529886e+04 |
| min | 1.817948e+11 | -4.090060e+01 | -1.751982e+02 | 1.264600e+04 |
| 25% | 7.740508e+14 | 5.152100e+00 | -1.531010e+01 | 4.311400e+04 |
| 50% | 1.547858e+15 | 1.807080e+01 | 1.914510e+01 | 7.363300e+04 |
| 75% | 2.323729e+15 | 3.907420e+01 | 4.757690e+01 | 1.043000e+05 |
| max | 3.099618e+15 | 7.170690e+01 | 1.780650e+02 | 1.348340e+05 |
In [4]:
x.shape
Out[4]:
(1615940, 23)
In [5]:
x.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1615940 entries, 0 to 1615939 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Job Id 1615940 non-null int64 1 Experience 1615940 non-null object 2 Qualifications 1615940 non-null object 3 Salary Range 1615940 non-null object 4 location 1615940 non-null object 5 Country 1615940 non-null object 6 latitude 1615940 non-null float64 7 longitude 1615940 non-null float64 8 Work Type 1615940 non-null object 9 Company Size 1615940 non-null int64 10 Job Posting Date 1615940 non-null object 11 Preference 1615940 non-null object 12 Contact Person 1615940 non-null object 13 Contact 1615940 non-null object 14 Job Title 1615940 non-null object 15 Role 1615940 non-null object 16 Job Portal 1615940 non-null object 17 Job Description 1615940 non-null object 18 Benefits 1615940 non-null object 19 skills 1615940 non-null object 20 Responsibilities 1615940 non-null object 21 Company 1615940 non-null object 22 Company Profile 1610462 non-null object dtypes: float64(2), int64(2), object(19) memory usage: 283.6+ MB
In [6]:
x.dtypes
Out[6]:
Job Id int64 Experience object Qualifications object Salary Range object location object Country object latitude float64 longitude float64 Work Type object Company Size int64 Job Posting Date object Preference object Contact Person object Contact object Job Title object Role object Job Portal object Job Description object Benefits object skills object Responsibilities object Company object Company Profile object dtype: object
In [ ]:
In [13]:
import matplotlib.pyplot as plt
plt.hist(x['Company Size'], bins=20)
plt.xlabel('Company Size')
plt.ylabel('Frequency')
plt.title('Histogram of Company Size')
plt.show()
In [14]:
job_id_bins = pd.qcut(x['Job Id'], q=5)
x.groupby(job_id_bins)['Company Size'].mean().plot(kind='bar')
plt.xlabel('Job ID')
plt.ylabel('Average Company Size')
plt.title('Bar Plot of Company Size by Job ID')
plt.show()
C:\Users\pc\AppData\Local\Temp\ipykernel_22428\2552711917.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. x.groupby(job_id_bins)['Company Size'].mean().plot(kind='bar')
In [16]:
import seaborn as sns
sns.violinplot(x='latitude', y='Company Size', data=x)
plt.xlabel('Latitude')
plt.ylabel('Company Size')
plt.title('Violin Plot of Company Size by Latitude')
plt.show()
In [18]:
plt.scatter(x['latitude'], x['longitude'], alpha=0.5)
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Scatter Plot of Latitude vs Longitude')
plt.show()
In [19]:
plt.figure(figsize=(10,6))
x['Role'].value_counts().plot(kind='bar')
plt.xlabel('Role')
plt.ylabel('Frequency')
plt.title('Bar Plot of Role Frequency')
plt.show()
In [23]:
plt.hist(x['Salary Range'], bins=20)
plt.xlabel('Salary Range')
plt.ylabel('Frequency')
plt.title('Histogram of Salary Range')
plt.show()
In [24]:
x.columns
Out[24]:
Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
'skills', 'Responsibilities', 'Company', 'Company Profile'],
dtype='object')
In [26]:
x = x.drop(columns=['latitude'])
In [28]:
x = x.drop(columns=['longitude'])
In [29]:
x.columns
Out[29]:
Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
'Country', 'Work Type', 'Company Size', 'Job Posting Date',
'Preference', 'Contact Person', 'Contact', 'Job Title', 'Role',
'Job Portal', 'Job Description', 'Benefits', 'skills',
'Responsibilities', 'Company', 'Company Profile'],
dtype='object')
In [12]:
plt.figure(figsize=(8,8))
x['Work Type'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.ylabel('Work Type')
plt.title('Pie Chart of Work Type')
plt.show()
In [14]:
# Descriptive statistics for numeric columns
numeric_columns = ['Job Id', 'Salary Range', 'Company Size']
numeric_stats = x[numeric_columns].describe()
print(numeric_stats)
Job Id Company Size count 1.615940e+06 1.615940e+06 mean 1.548935e+15 7.370467e+04 std 8.946722e+14 3.529886e+04 min 1.817948e+11 1.264600e+04 25% 7.740508e+14 4.311400e+04 50% 1.547858e+15 7.363300e+04 75% 2.323729e+15 1.043000e+05 max 3.099618e+15 1.348340e+05
In [15]:
# Distribution of categorical columns
categorical_columns = ['Work Type', 'Role', 'Job Portal']
for column in categorical_columns:
print(x[column].value_counts())
Work Type
Part-Time 324044
Temporary 323439
Contract 323131
Intern 323090
Full-Time 322236
Name: count, dtype: int64
Role
Interaction Designer 20580
Network Administrator 17470
User Interface Designer 14036
Social Media Manager 13945
User Experience Designer 13935
...
Inventory Control Specialist 3342
Budget Analyst 3335
Clinical Nurse Manager 3324
Social Science Researcher 3321
Paid Advertising Specialist 3306
Name: count, Length: 376, dtype: int64
Job Portal
FlexJobs 129879
Stack Overflow Jobs 129379
Jobs2Careers 129245
Snagajob 129088
USAJOBS 129066
SimplyHired 129059
The Muse 129033
Idealist 128952
Internships.com 128790
Monster 65058
Dice 64927
ZipRecruiter 64805
Indeed 64776
CareerBuilder 64752
LinkedIn 64664
Glassdoor 64467
Name: count, dtype: int64
In [2]:
import pandas as pd
x = pd.read_csv('job_descriptions.csv')
print(x)
Job Id Experience Qualifications Salary Range \
0 1089843540111562 5 to 15 Years M.Tech $59K-$99K
1 398454096642776 2 to 12 Years BCA $56K-$116K
2 481640072963533 0 to 12 Years PhD $61K-$104K
3 688192671473044 4 to 11 Years PhD $65K-$91K
4 117057806156508 1 to 12 Years MBA $64K-$87K
... ... ... ... ...
1615935 134563577088850 0 to 12 Years B.Tech $64K-$114K
1615936 618604818190827 2 to 14 Years M.Tech $62K-$130K
1615937 615471367712200 4 to 15 Years BCA $60K-$96K
1615938 804137342023945 5 to 15 Years BCA $65K-$103K
1615939 404645755314484 1 to 11 Years BBA $56K-$109K
location Country latitude longitude Work Type \
0 Douglas Isle of Man 54.2361 -4.5481 Intern
1 Ashgabat Turkmenistan 38.9697 59.5563 Intern
2 Macao Macao SAR, China 22.1987 113.5439 Temporary
3 Porto-Novo Benin 9.3077 2.3158 Full-Time
4 Santiago Chile -35.6751 -71.5429 Intern
... ... ... ... ... ...
1615935 Malabo (de jure), Equatorial Guinea 1.6508 10.2679 Full-Time
1615936 Warsaw Poland 51.9194 19.1451 Intern
1615937 Ashgabat Turkmenistan 38.9697 59.5563 Part-Time
1615938 Ouagadougou Burkina Faso 12.2383 -1.5616 Full-Time
1615939 Asmara Eritrea 15.1794 39.7823 Part-Time
Company Size ... Contact \
0 26801 ... 001-381-930-7517x737
1 100340 ... 461-509-4216
2 84525 ... 9687619505
3 129896 ... +1-820-643-5431x47576
4 53944 ... 343.975.4702x9340
... ... ... ...
1615935 18281 ... 950-451-5843
1615936 63621 ... 676.387.1572x71877
1615937 114287 ... 537.384.6193x5284
1615938 45009 ... (484)257-4755x5346
1615939 87637 ... (989)703-9723
Job Title Role \
0 Digital Marketing Specialist Social Media Manager
1 Web Developer Frontend Web Developer
2 Operations Manager Quality Control Manager
3 Network Engineer Wireless Network Engineer
4 Event Manager Conference Manager
... ... ...
1615935 Mechanical Engineer Mechanical Design Engineer
1615936 IT Manager IT Director
1615937 Mechanical Engineer Mechanical Design Engineer
1615938 HR Coordinator Training Coordinator
1615939 Event Planner Wedding Planner
Job Portal \
0 Snagajob
1 Idealist
2 Jobs2Careers
3 FlexJobs
4 Jobs2Careers
... ...
1615935 ZipRecruiter
1615936 USAJOBS
1615937 Indeed
1615938 Stack Overflow Jobs
1615939 USAJOBS
Job Description \
0 Social Media Managers oversee an organizations...
1 Frontend Web Developers design and implement u...
2 Quality Control Managers establish and enforce...
3 Wireless Network Engineers design, implement, ...
4 A Conference Manager coordinates and manages c...
... ...
1615935 Mechanical Design Engineers create and develop...
1615936 An IT Director oversees an organizations IT de...
1615937 Mechanical Design Engineers create and develop...
1615938 Training Coordinators design and implement emp...
1615939 Wedding Planners specialize in organizing wedd...
Benefits \
0 {'Flexible Spending Accounts (FSAs), Relocatio...
1 {'Health Insurance, Retirement Plans, Paid Tim...
2 {'Legal Assistance, Bonuses and Incentive Prog...
3 {'Transportation Benefits, Professional Develo...
4 {'Flexible Spending Accounts (FSAs), Relocatio...
... ...
1615935 {'Employee Assistance Programs (EAP), Tuition ...
1615936 {'Health Insurance, Retirement Plans, Paid Tim...
1615937 {'Tuition Reimbursement, Stock Options or Equi...
1615938 {'Casual Dress Code, Social and Recreational A...
1615939 {'Transportation Benefits, Professional Develo...
skills \
0 Social media platforms (e.g., Facebook, Twitte...
1 HTML, CSS, JavaScript Frontend frameworks (e.g...
2 Quality control processes and methodologies St...
3 Wireless network design and architecture Wi-Fi...
4 Event planning Conference logistics Budget man...
... ...
1615935 Mechanical engineering CAD software (e.g., Sol...
1615936 Strategic IT planning Leadership and managemen...
1615937 Mechanical engineering CAD software (e.g., Sol...
1615938 Training program coordination Training materia...
1615939 Wedding planning Venue selection Catering and ...
Responsibilities \
0 Manage and grow social media accounts, create ...
1 Design and code user interfaces for websites, ...
2 Establish and enforce quality control standard...
3 Design, configure, and optimize wireless netwo...
4 Specialize in conference and convention planni...
... ...
1615935 Design mechanical systems, components, and pro...
1615936 Provide strategic leadership for IT department...
1615937 Design mechanical systems, components, and pro...
1615938 Coordinate employee training programs, track t...
1615939 Specialize in wedding planning, assisting coup...
Company \
0 Icahn Enterprises
1 PNC Financial Services Group
2 United Services Automobile Assn.
3 Hess
4 Cairn Energy
... ...
1615935 The Hershey Company
1615936 EQT
1615937 KLA
1615938 Mahindra & Mahindra
1615939 Ashtead Group
Company Profile
0 {"Sector":"Diversified","Industry":"Diversifie...
1 {"Sector":"Financial Services","Industry":"Com...
2 {"Sector":"Insurance","Industry":"Insurance: P...
3 {"Sector":"Energy","Industry":"Mining, Crude-O...
4 {"Sector":"Energy","Industry":"Energy - Oil & ...
... ...
1615935 {"Sector":"Food and Beverage/Confectionery","I...
1615936 {"Sector":"Energy","Industry":"Energy","City":...
1615937 {"Sector":"Technology","Industry":"Semiconduct...
1615938 {"Sector":"Automotive","Industry":"Automotive"...
1615939 {"Sector":"Equipment Rental","Industry":"Equip...
[1615940 rows x 23 columns]
In [3]:
import matplotlib.pyplot as pd
# Your plotting code
plt.boxplot(x['Experience'])
plt.ylabel('Experience')
plt.title('Box Plot of Experience')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 4 1 import matplotlib.pyplot as pd 3 # Your plotting code ----> 4 plt.boxplot(x['Experience']) 5 plt.ylabel('Experience') 6 plt.title('Box Plot of Experience') NameError: name 'x' is not defined
In [6]:
import pandas as pd
x = pd.read_csv('job_descriptions.csv')
plt.boxplot(x['Company Size'])
plt.ylabel('Company Size')
plt.title('Box Plot of Company Size')
plt.show()
In [5]:
import pandas as pd
# Load dataset
data = pd.read_csv("your_dataset.csv")
# Descriptive analysis
print("Summary Statistics:")
print(data.describe())
print("\nUnique Values and Frequency Distributions:")
for column in data.columns:
print(column)
print(data[column].value_counts())
print()
# Text analysis of job descriptions
# Example: Sentiment analysis
sentiments = [TextBlob(text).sentiment.polarity for text in data["Job Description"]]
data["Sentiment"] = sentiments
# Company profile analysis
# Example: Analyze company profiles to understand company reputation, values, and culture
# You can use techniques like keyword extraction and sentiment analysis for this
# Further analysis and visualization as needed
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[5], line 5 1 import pandas as pd 4 # Load dataset ----> 5 data = pd.read_csv("your_dataset.csv") 7 # Descriptive analysis 8 print("Summary Statistics:") File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend) 935 kwds_defaults = _refine_defaults_read( 936 dialect, 937 delimiter, (...) 944 dtype_backend=dtype_backend, 945 ) 946 kwds.update(kwds_defaults) --> 948 return _read(filepath_or_buffer, kwds) File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:611, in _read(filepath_or_buffer, kwds) 608 _validate_names(kwds.get("names", None)) 610 # Create the parser. --> 611 parser = TextFileReader(filepath_or_buffer, **kwds) 613 if chunksize or iterator: 614 return parser File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds) 1445 self.options["has_index_names"] = kwds["has_index_names"] 1447 self.handles: IOHandles | None = None -> 1448 self._engine = self._make_engine(f, self.engine) File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1705, in TextFileReader._make_engine(self, f, engine) 1703 if "b" not in mode: 1704 mode += "b" -> 1705 self.handles = get_handle( 1706 f, 1707 mode, 1708 encoding=self.options.get("encoding", None), 1709 compression=self.options.get("compression", None), 1710 memory_map=self.options.get("memory_map", False), 1711 is_text=is_text, 1712 errors=self.options.get("encoding_errors", "strict"), 1713 storage_options=self.options.get("storage_options", None), 1714 ) 1715 assert self.handles is not None 1716 f = self.handles.handle File ~\anaconda3\Lib\site-packages\pandas\io\common.py:863, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 858 elif isinstance(handle, str): 859 # Check whether the filename is to be opened in binary mode. 860 # Binary mode does not support 'encoding' and 'newline'. 861 if ioargs.encoding and "b" not in ioargs.mode: 862 # Encoding --> 863 handle = open( 864 handle, 865 ioargs.mode, 866 encoding=ioargs.encoding, 867 errors=errors, 868 newline="", 869 ) 870 else: 871 # Binary mode 872 handle = open(handle, ioargs.mode) FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'
In [7]:
!pip install spacy
Collecting spacy Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB) Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy) Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB) Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy) Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB) Collecting murmurhash<1.1.0,>=0.28.0 (from spacy) Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB) Collecting cymem<2.1.0,>=2.0.2 (from spacy) Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB) Collecting preshed<3.1.0,>=3.0.2 (from spacy) Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB) Collecting thinc<8.3.0,>=8.2.2 (from spacy) Downloading thinc-8.2.3-cp311-cp311-win_amd64.whl.metadata (15 kB) Collecting wasabi<1.2.0,>=0.9.1 (from spacy) Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB) Collecting srsly<3.0.0,>=2.4.3 (from spacy) Downloading srsly-2.4.8-cp311-cp311-win_amd64.whl.metadata (20 kB) Collecting catalogue<2.1.0,>=2.0.6 (from spacy) Downloading catalogue-2.0.10-py3-none-any.whl.metadata (14 kB) Collecting weasel<0.4.0,>=0.1.0 (from spacy) Downloading weasel-0.3.4-py3-none-any.whl.metadata (4.7 kB) Collecting typer<0.10.0,>=0.3.0 (from spacy) Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB) Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (5.2.1) Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (4.65.0) Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (2.31.0) Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (1.10.12) Requirement already satisfied: jinja2 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (3.1.3) Requirement already satisfied: setuptools in c:\users\pc\anaconda3\lib\site-packages (from spacy) (68.2.2) Requirement already satisfied: packaging>=20.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (23.1) Collecting langcodes<4.0.0,>=3.2.0 (from spacy) Downloading langcodes-3.4.0-py3-none-any.whl.metadata (29 kB) Requirement already satisfied: numpy>=1.19.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (1.26.4) Collecting language-data>=1.2 (from langcodes<4.0.0,>=3.2.0->spacy) Downloading language_data-1.2.0-py3-none-any.whl.metadata (4.3 kB) Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\pc\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.9.0) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.2.2) Collecting blis<0.8.0,>=0.7.8 (from thinc<8.3.0,>=8.2.2->spacy) Downloading blis-0.7.11-cp311-cp311-win_amd64.whl.metadata (7.6 kB) Collecting confection<1.0.0,>=0.0.1 (from thinc<8.3.0,>=8.2.2->spacy) Downloading confection-0.1.4-py3-none-any.whl.metadata (19 kB) Requirement already satisfied: colorama in c:\users\pc\anaconda3\lib\site-packages (from tqdm<5.0.0,>=4.38.0->spacy) (0.4.6) Requirement already satisfied: click<9.0.0,>=7.1.1 in c:\users\pc\anaconda3\lib\site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7) Collecting cloudpathlib<0.17.0,>=0.7.0 (from weasel<0.4.0,>=0.1.0->spacy) Downloading cloudpathlib-0.16.0-py3-none-any.whl.metadata (14 kB) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\pc\anaconda3\lib\site-packages (from jinja2->spacy) (2.1.3) Collecting marisa-trie>=0.7.7 (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy) Downloading marisa_trie-1.1.0-cp311-cp311-win_amd64.whl.metadata (8.8 kB) Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl (12.1 MB) ---------------------------------------- 0.0/12.1 MB ? eta -:--:-- - -------------------------------------- 0.5/12.1 MB 16.5 MB/s eta 0:00:01 ----- ---------------------------------- 1.8/12.1 MB 22.7 MB/s eta 0:00:01 ---------- ----------------------------- 3.1/12.1 MB 24.7 MB/s eta 0:00:01 --------------- ------------------------ 4.8/12.1 MB 25.4 MB/s eta 0:00:01 --------------------- ------------------ 6.5/12.1 MB 29.8 MB/s eta 0:00:01 ------------------------- -------------- 7.8/12.1 MB 29.4 MB/s eta 0:00:01 -------------------------------- ------- 9.7/12.1 MB 31.0 MB/s eta 0:00:01 ------------------------------------- -- 11.2/12.1 MB 34.6 MB/s eta 0:00:01 --------------------------------------- 12.1/12.1 MB 34.4 MB/s eta 0:00:01 ---------------------------------------- 12.1/12.1 MB 29.7 MB/s eta 0:00:00 Downloading catalogue-2.0.10-py3-none-any.whl (17 kB) Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl (39 kB) Downloading langcodes-3.4.0-py3-none-any.whl (182 kB) ---------------------------------------- 0.0/182.0 kB ? eta -:--:-- --------------------------------------- 182.0/182.0 kB 10.7 MB/s eta 0:00:00 Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl (25 kB) Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl (122 kB) ---------------------------------------- 0.0/122.3 kB ? eta -:--:-- ---------------------------------------- 122.3/122.3 kB ? eta 0:00:00 Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB) Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB) Downloading srsly-2.4.8-cp311-cp311-win_amd64.whl (479 kB) ---------------------------------------- 0.0/479.7 kB ? eta -:--:-- --------------------------------------- 479.7/479.7 kB 14.7 MB/s eta 0:00:00 Downloading thinc-8.2.3-cp311-cp311-win_amd64.whl (1.5 MB) ---------------------------------------- 0.0/1.5 MB ? eta -:--:-- --------------------------------------- 1.5/1.5 MB 31.2 MB/s eta 0:00:01 ---------------------------------------- 1.5/1.5 MB 23.3 MB/s eta 0:00:00 Downloading typer-0.9.4-py3-none-any.whl (45 kB) ---------------------------------------- 0.0/46.0 kB ? eta -:--:-- ---------------------------------------- 46.0/46.0 kB 2.2 MB/s eta 0:00:00 Downloading wasabi-1.1.2-py3-none-any.whl (27 kB) Downloading weasel-0.3.4-py3-none-any.whl (50 kB) ---------------------------------------- 0.0/50.1 kB ? eta -:--:-- ---------------------------------------- 50.1/50.1 kB 2.5 MB/s eta 0:00:00 Downloading blis-0.7.11-cp311-cp311-win_amd64.whl (6.6 MB) ---------------------------------------- 0.0/6.6 MB ? eta -:--:-- -------- ------------------------------- 1.5/6.6 MB 47.6 MB/s eta 0:00:01 ------------------- -------------------- 3.2/6.6 MB 40.6 MB/s eta 0:00:01 ------------------------------ --------- 5.1/6.6 MB 40.6 MB/s eta 0:00:01 -------------------------------------- - 6.4/6.6 MB 37.4 MB/s eta 0:00:01 ---------------------------------------- 6.6/6.6 MB 32.5 MB/s eta 0:00:00 Downloading cloudpathlib-0.16.0-py3-none-any.whl (45 kB) ---------------------------------------- 0.0/45.0 kB ? eta -:--:-- ---------------------------------------- 45.0/45.0 kB ? eta 0:00:00 Downloading confection-0.1.4-py3-none-any.whl (35 kB) Downloading language_data-1.2.0-py3-none-any.whl (5.4 MB) ---------------------------------------- 0.0/5.4 MB ? eta -:--:-- ------------ --------------------------- 1.7/5.4 MB 35.5 MB/s eta 0:00:01 ------------------------ --------------- 3.3/5.4 MB 34.9 MB/s eta 0:00:01 -------------------------------------- - 5.2/5.4 MB 36.9 MB/s eta 0:00:01 --------------------------------------- 5.4/5.4 MB 38.3 MB/s eta 0:00:01 ---------------------------------------- 5.4/5.4 MB 26.5 MB/s eta 0:00:00 Downloading marisa_trie-1.1.0-cp311-cp311-win_amd64.whl (152 kB) ---------------------------------------- 0.0/152.6 kB ? eta -:--:-- ---------------------------------------- 152.6/152.6 kB 8.9 MB/s eta 0:00:00 Installing collected packages: cymem, wasabi, spacy-loggers, spacy-legacy, murmurhash, marisa-trie, cloudpathlib, catalogue, blis, typer, srsly, preshed, language-data, langcodes, confection, weasel, thinc, spacy Successfully installed blis-0.7.11 catalogue-2.0.10 cloudpathlib-0.16.0 confection-0.1.4 cymem-2.0.8 langcodes-3.4.0 language-data-1.2.0 marisa-trie-1.1.0 murmurhash-1.0.10 preshed-3.0.9 spacy-3.7.4 spacy-legacy-3.0.12 spacy-loggers-1.0.5 srsly-2.4.8 thinc-8.2.3 typer-0.9.4 wasabi-1.1.2 weasel-0.3.4
In [8]:
!python -m spacy download en_core_web_sm
Collecting en-core-web-sm==3.7.1
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
---------------------------------------- 0.0/12.8 MB ? eta -:--:--
---------------------------------------- 0.0/12.8 MB ? eta -:--:--
--------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
--------------------------------------- 0.1/12.8 MB 469.7 kB/s eta 0:00:28
- -------------------------------------- 0.4/12.8 MB 2.2 MB/s eta 0:00:06
----- ---------------------------------- 1.6/12.8 MB 7.9 MB/s eta 0:00:02
--------- ------------------------------ 3.0/12.8 MB 12.1 MB/s eta 0:00:01
-------------- ------------------------- 4.7/12.8 MB 15.8 MB/s eta 0:00:01
-------------------- ------------------- 6.6/12.8 MB 19.3 MB/s eta 0:00:01
------------------------ --------------- 8.0/12.8 MB 20.5 MB/s eta 0:00:01
------------------------------ --------- 9.8/12.8 MB 22.5 MB/s eta 0:00:01
-------------------------------- ------ 10.8/12.8 MB 34.4 MB/s eta 0:00:01
---------------------------------- ---- 11.4/12.8 MB 29.7 MB/s eta 0:00:01
------------------------------------ -- 12.1/12.8 MB 28.4 MB/s eta 0:00:01
-------------------------------------- 12.8/12.8 MB 28.5 MB/s eta 0:00:01
-------------------------------------- 12.8/12.8 MB 28.5 MB/s eta 0:00:01
-------------------------------------- 12.8/12.8 MB 28.5 MB/s eta 0:00:01
-------------------------------------- 12.8/12.8 MB 28.5 MB/s eta 0:00:01
--------------------------------------- 12.8/12.8 MB 18.7 MB/s eta 0:00:00
Requirement already satisfied: spacy<3.8.0,>=3.7.2 in c:\users\pc\anaconda3\lib\site-packages (from en-core-web-sm==3.7.1) (3.7.4)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (5.2.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.65.0)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.10.12)
Requirement already satisfied: jinja2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.3)
Requirement already satisfied: setuptools in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (68.2.2)
Requirement already satisfied: packaging>=20.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (23.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4.0)
Requirement already satisfied: numpy>=1.19.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)
Requirement already satisfied: language-data>=1.2 in c:\users\pc\anaconda3\lib\site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)
Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\pc\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.9.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.2.2)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in c:\users\pc\anaconda3\lib\site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in c:\users\pc\anaconda3\lib\site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)
Requirement already satisfied: colorama in c:\users\pc\anaconda3\lib\site-packages (from tqdm<5.0.0,>=4.38.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.4.6)
Requirement already satisfied: click<9.0.0,>=7.1.1 in c:\users\pc\anaconda3\lib\site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in c:\users\pc\anaconda3\lib\site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\pc\anaconda3\lib\site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.3)
Requirement already satisfied: marisa-trie>=0.7.7 in c:\users\pc\anaconda3\lib\site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.0)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
In [9]:
import pandas as pd
import spacy
from collections import Counter
from textblob import TextBlob
# Load dataset
data = pd.read_csv("your_dataset.csv")
# Load spaCy model
nlp = spacy.load("en_core_web_sm")
# Keyword extraction
skills_responsibilities = []
company_values = []
for text in data["Job Description"]:
doc = nlp(text)
for token in doc:
if token.pos_ == "NOUN":
if token.dep_ == "dobj" or token.dep_ == "attr":
skills_responsibilities.append(token.text)
elif token.dep_ == "nsubj" or token.dep_ == "pobj":
company_values.append(token.text)
# Sentiment analysis
sentiments = [TextBlob(text).sentiment.polarity for text in data["Job Description"]]
data["Sentiment"] = sentiments
# Optimization for search engine visibility (Not implemented in this example)
# Further analysis and visualization as needed
print("Common Skills and Responsibilities:")
print(Counter(skills_responsibilities).most_common(10))
print("\nCompany Values:")
print(Counter(company_values).most_common(5))
print("\nSentiment Analysis:")
print(data[["Job Description", "Sentiment"]])
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[9], line 4 2 import spacy 3 from collections import Counter ----> 4 from textblob import TextBlob 6 # Load dataset 7 data = pd.read_csv("your_dataset.csv") ModuleNotFoundError: No module named 'textblob'
In [11]:
!pip install textblob
Requirement already satisfied: textblob in c:\users\pc\anaconda3\lib\site-packages (0.18.0.post0) Requirement already satisfied: nltk>=3.8 in c:\users\pc\anaconda3\lib\site-packages (from textblob) (3.8.1) Requirement already satisfied: click in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (8.1.7) Requirement already satisfied: joblib in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (1.2.0) Requirement already satisfied: regex>=2021.8.3 in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (2023.10.3) Requirement already satisfied: tqdm in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (4.65.0) Requirement already satisfied: colorama in c:\users\pc\anaconda3\lib\site-packages (from click->nltk>=3.8->textblob) (0.4.6)
In [12]:
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Read the CSV file into a DataFrame
df = pd.read_csv('your_data.csv')
# Tokenize the text in the 'Job Description' column
descriptions = df['Job Description'].dropna().str.split()
# Flatten the list of lists and count the frequency of words
word_counts = Counter([word for sublist in descriptions for word in sublist])
# Plot word frequency using a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Job Description')
plt.show()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[12], line 3 1 import pandas as pd 2 from collections import Counter ----> 3 from wordcloud import WordCloud 4 import matplotlib.pyplot as plt 6 # Read the CSV file into a DataFrame ModuleNotFoundError: No module named 'wordcloud'
In [ ]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
# Load the dataset
x = pd.read_csv('job_descriptions.csv')
# Extract job descriptions
job_descriptions = x['Job Description']
# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(job_descriptions)
# Tokenize the text into words
words = word_tokenize(all_descriptions)
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]
# Calculate frequency distribution
fdist = FreqDist(filtered_words)
# Plot the most common words
plt.figure(figsize=(10, 6))
fdist.plot(30, cumulative=False)
plt.title('Top 30 Most Common Words in Job Descriptions')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.show()
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset
data = pd.read_csv('job_descriptions.csv')
# Plotting skills
skills_counts = data['skills'].value_counts().head(10)
plt.figure(figsize=(10,6))
skills_counts.plot(kind='bar', color='skyblue')
plt.title('Top 10 Most Required Skills')
plt.xlabel('Skills')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
# Plotting responsibilities
responsibilities_counts = data['Responsibilities'].value_counts().head(10)
plt.figure(figsize=(10,6))
responsibilities_counts.plot(kind='bar', color='lightgreen')
plt.title('Top 10 Most Common Responsibilities')
plt.xlabel('Responsibilities')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
# Plotting companies
company_counts = data['Company'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_counts.plot(kind='bar', color='salmon')
plt.title('Top 10 Companies with Most Job Openings')
plt.xlabel('Company')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
# Plotting company profiles
company_profile_counts = data['Company Profile'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_profile_counts.plot(kind='bar', color='gold')
plt.title('Top 10 Most Common Company Profiles')
plt.xlabel('Company Profile')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
In [2]:
# Plotting skills
skills_counts = data['skills'].value_counts().head(10)
plt.figure(figsize=(10,6))
skills_counts.plot(kind='bar', color='skyblue')
plt.title('Top 10 Most Required Skills')
plt.xlabel('Skills')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
In [3]:
# Plotting responsibilities
responsibilities_counts = data['Responsibilities'].value_counts().head(10)
plt.figure(figsize=(10,6))
responsibilities_counts.plot(kind='bar', color='lightgreen')
plt.title('Top 10 Most Common Responsibilities')
plt.xlabel('Responsibilities')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
In [4]:
# Plotting companies
company_counts = data['Company'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_counts.plot(kind='bar', color='salmon')
plt.title('Top 10 Companies with Most Job Openings')
plt.xlabel('Company')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
In [5]:
# Plotting company profiles
company_profile_counts = data['Company Profile'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_profile_counts.plot(kind='bar', color='gold')
plt.title('Top 10 Most Common Company Profiles')
plt.xlabel('Company Profile')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
# Example of keyword extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(x['skills'].dropna())
tfidf_features = tfidf_vectorizer.get_feature_names_out()
print("Top 10 keywords for skills:")
print(tfidf_features[:10])
# Example of sentiment analysis using TextBlob
x['Job_Description_Sentiment'] = x['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
print("Average sentiment of job descriptions:", x['Job_Description_Sentiment'].mean())
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 6 4 # Example of keyword extraction using TF-IDF 5 tfidf_vectorizer = TfidfVectorizer(max_features=1000) ----> 6 tfidf_matrix = tfidf_vectorizer.fit_transform(x['skills'].dropna()) 7 tfidf_features = tfidf_vectorizer.get_feature_names_out() 8 print("Top 10 keywords for skills:") NameError: name 'x' is not defined
In [2]:
#Analyze skill gaps and match candidate profiles to job requirements.
Identify emerging skills and responsibilities in specific industries.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Load the dataset
data = pd.read_csv('job_descriptions.csv')
# Tokenize and extract skills from job descriptions
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X = vectorizer.fit_transform(data['Job Description'].dropna())
skills = vectorizer.get_feature_names_out()
# Categorize skills into relevant categories
technical_skills = []
soft_skills = []
for skill in skills:
if 'technical' in skill.lower() or 'programming' in skill.lower():
technical_skills.append(skill)
else:
soft_skills.append(skill)
print("Technical Skills:", technical_skills)
print("Soft Skills:", soft_skills)
Technical Skills: ['provide technical', 'technical', 'technical support'] Soft Skills: ['account', 'accounting', 'accounts', 'accurate', 'achieve', 'acquisitions', 'across', 'activities', 'address', 'administrative', 'administrative support', 'administrative tasks', 'administrators', 'administrators manage', 'advertising', 'advice', 'advise', 'advisors', 'aesthetically', 'aesthetically pleasing', 'aesthetics', 'agile', 'align', 'align with', 'all', 'also', 'an', 'an organization', 'an organizations', 'analysis', 'analysis and', 'analyst', 'analysts', 'analysts analyze', 'analyze', 'analyze data', 'analyze financial', 'analyze social', 'analyzes', 'and', 'and analysis', 'and analyze', 'and analyzes', 'and applications', 'and assisting', 'and collaborate', 'and customer', 'and data', 'and design', 'and develop', 'and develops', 'and educate', 'and efficiency', 'and efficient', 'and engagement', 'and engaging', 'and enhance', 'and ensure', 'and ensures', 'and ensuring', 'and execute', 'and financial', 'and functional', 'and handling', 'and implement', 'and implementing', 'and improve', 'and interactive', 'and layouts', 'and maintain', 'and maintaining', 'and manage', 'and manages', 'and managing', 'and market', 'and meet', 'and optimize', 'and other', 'and oversee', 'and performance', 'and promote', 'and provide', 'and quality', 'and recommendations', 'and regulations', 'and respond', 'and responsive', 'and schedule', 'and security', 'and server', 'and services', 'and software', 'and support', 'and system', 'and systems', 'and treat', 'and user', 'and visually', 'and within', 'and work', 'answer', 'appealing', 'appealing and', 'appealing user', 'application', 'applications', 'applications they', 'architect', 'architectural', 'are', 'are responsible', 'art', 'artistic', 'as', 'aspects', 'aspects of', 'assess', 'assesses', 'assist', 'assist in', 'assist with', 'assistance', 'assistant', 'assisting', 'assists', 'assurance', 'audiences', 'automated', 'automation', 'availability', 'awareness', 'awareness and', 'backend', 'behavior', 'behaviors', 'behaviors and', 'being', 'benefit', 'best', 'between', 'brand', 'brand awareness', 'budget', 'budgeting', 'budgets', 'budgets and', 'build', 'build and', 'build relationships', 'building', 'building and', 'buildings', 'business', 'business decisions', 'business objectives', 'businesses', 'buttons', 'buttons and', 'by', 'by considering', 'campaigns', 'campaigns they', 'care', 'care and', 'care to', 'cases', 'chain', 'channels', 'child', 'children', 'client', 'clients', 'clients in', 'clients or', 'clinical', 'closely', 'closely with', 'cloud', 'code', 'cohesive', 'cohesive and', 'collaborate', 'collaborate with', 'collect', 'communicate', 'communication', 'communication and', 'communication to', 'community', 'company', 'companys', 'competitive', 'complex', 'compliance', 'compliance with', 'components', 'computer', 'computer networks', 'conditions', 'conduct', 'conduct keyword', 'conduct user', 'conducts', 'conferences', 'configure', 'configure monitor', 'considering', 'considering user', 'consistency', 'construction', 'consultant', 'consumer', 'consumer behavior', 'content', 'content and', 'content engage', 'contracts', 'contracts and', 'control', 'coordinate', 'coordinates', 'coordinating', 'coordinators', 'corporate', 'correspondence', 'cost', 'cost effective', 'cost saving', 'costs', 'counsel', 'create', 'create and', 'create intuitive', 'create meaningful', 'create visually', 'creating', 'creating user', 'cross', 'cross functional', 'custody', 'customer', 'customer satisfaction', 'customer success', 'customer support', 'customers', 'customers they', 'daily', 'data', 'data analysis', 'data and', 'data driven', 'data identify', 'data integrity', 'data storage', 'data to', 'database', 'databases', 'databases ensuring', 'day', 'decision', 'decision making', 'decisions', 'defects', 'delivery', 'demand', 'departments', 'design', 'design and', 'design implement', 'design layouts', 'design prototypes', 'designer', 'designers', 'designers create', 'designers focus', 'designers specialize', 'designing', 'designing and', 'designing user', 'designs', 'designs and', 'develop', 'develop and', 'developer', 'developer is', 'developers', 'developing', 'development', 'development and', 'development teams', 'develops', 'devices', 'diagnose', 'diagnose and', 'different', 'digital', 'digital interfaces', 'digital marketing', 'directors', 'disputes', 'distribution', 'document', 'documentation', 'documents', 'drive', 'drive brand', 'driven', 'driven decision', 'educate', 'effective', 'effectively', 'efficiency', 'efficiency and', 'efficient', 'efforts', 'electrical', 'elements', 'elements to', 'email', 'email marketing', 'employee', 'employees', 'employment', 'end', 'engage', 'engage with', 'engagement', 'engaging', 'engaging user', 'engine', 'engineer', 'engineers', 'engineers design', 'engineers focus', 'engines', 'enhance', 'enhance the', 'enhancing', 'ensure', 'ensure cohesive', 'ensure compliance', 'ensure efficient', 'ensure product', 'ensure the', 'ensure they', 'ensures', 'ensuring', 'ensuring compliance', 'ensuring data', 'ensuring optimal', 'ensuring seamless', 'ensuring they', 'environment', 'environmental', 'environments', 'evaluate', 'event', 'events', 'events including', 'execute', 'executives', 'experience', 'experience designers', 'experience of', 'experiences', 'experiences by', 'expert', 'expertise', 'facilitate', 'facilities', 'families', 'family', 'financial', 'financial goals', 'financial planning', 'findings', 'flow', 'focus', 'focus on', 'focuses', 'focuses on', 'focusing', 'focusing on', 'followers', 'followers and', 'for', 'for search', 'for various', 'for web', 'fostering', 'friendly', 'friendly and', 'friendly digital', 'from', 'front', 'front end', 'frontend', 'functional', 'functional teams', 'functionality', 'functionality and', 'gather', 'generate', 'goals', 'goals and', 'goods', 'goods and', 'graphics', 'growth', 'guidance', 'guide', 'handle', 'handling', 'hardware', 'hardware and', 'health', 'healthcare', 'help', 'helping', 'helping them', 'high', 'hr', 'identify', 'identify cost', 'identify defects', 'identify trends', 'identifying', 'impact', 'implement', 'implement and', 'implement security', 'implementing', 'improve', 'improve online', 'improving', 'in', 'in creating', 'in designing', 'in legal', 'in planning', 'in specific', 'in the', 'incidents', 'include', 'including', 'including servers', 'incorporate', 'increase', 'individuals', 'industrial', 'inform', 'information', 'informed', 'infrastructure', 'infrastructure ensuring', 'infrastructure including', 'initiatives', 'inquiries', 'insights', 'insights and', 'insights to', 'integration', 'integrity', 'integrity and', 'interaction', 'interaction designers', 'interactions', 'interactions within', 'interactive', 'interactive aspects', 'interface', 'interface designers', 'interfaces', 'interfaces for', 'interfaces they', 'interior', 'into', 'intuitive', 'intuitive and', 'inventory', 'inventory levels', 'investment', 'involves', 'is', 'is responsible', 'issues', 'issues and', 'it', 'it infrastructure', 'it systems', 'java', 'keyword', 'keyword research', 'knowledge', 'landscape', 'landscapes', 'language', 'law', 'laws', 'laws and', 'layouts', 'layouts buttons', 'lead', 'leads', 'learning', 'legal', 'legal counsel', 'legal matters', 'legal proceedings', 'level', 'levels', 'life', 'like', 'litigation', 'logic', 'logic and', 'logistics', 'logistics and', 'maintain', 'maintain data', 'maintain network', 'maintaining', 'maintenance', 'make', 'make informed', 'making', 'making within', 'manage', 'manage an', 'manage and', 'manage budgets', 'management', 'management and', 'manager', 'manager is', 'manager oversees', 'managers', 'managers oversee', 'manages', 'managing', 'managing schedules', 'manufacturing', 'market', 'market research', 'market trends', 'marketing', 'marketing campaigns', 'marketing efforts', 'marketing strategies', 'materials', 'matters', 'matters related', 'maximize', 'may', 'meaningful', 'meaningful and', 'measures', 'media', 'media managers', 'media metrics', 'media presence', 'media strategies', 'medical', 'medical care', 'medical conditions', 'meet', 'meet quality', 'meetings', 'meetings and', 'members', 'mergers', 'metrics', 'metrics to', 'minimize', 'mobile', 'models', 'monitor', 'monitor and', 'needs', 'needs and', 'needs of', 'negotiate', 'negotiate contracts', 'network', 'network administrators', 'network infrastructure', 'network performance', 'networks', 'networks and', 'networks they', 'new', 'nurse', 'nurse practitioners', 'nursing', 'objectives', 'of', 'of digital', 'of goods', 'of software', 'of web', 'of websites', 'offer', 'office', 'office operations', 'often', 'on', 'on the', 'on time', 'online', 'online visibility', 'operations', 'opportunities', 'opportunities and', 'optimal', 'optimal performance', 'optimize', 'optimize websites', 'optimizing', 'or', 'or products', 'or services', 'oral', 'organic', 'organic traffic', 'organization', 'organization they', 'organizational', 'organizations', 'organizations computer', 'organizations it', 'organizations social', 'organizing', 'other', 'other elements', 'overall', 'overall user', 'oversee', 'oversee an', 'oversees', 'pages', 'parties', 'patient', 'patients', 'pediatric', 'perform', 'performance', 'performance and', 'performance they', 'performing', 'personal', 'persuasive', 'plan', 'plan and', 'planners', 'planning', 'planning and', 'plans', 'plans and', 'platforms', 'pleasing', 'policies', 'portfolio', 'portfolios', 'positive', 'potential', 'power', 'practices', 'practitioners', 'preferences', 'preparation', 'presence', 'presence they', 'preventive', 'preventive care', 'problems', 'procedures', 'proceedings', 'process', 'processes', 'processes to', 'procurement', 'procurement processes', 'product', 'product availability', 'product quality', 'production', 'products', 'products or', 'products they', 'products to', 'professionals', 'programs', 'programs and', 'project', 'projects', 'promote', 'promoting', 'protect', 'prototypes', 'prototypes and', 'provide', 'provide insights', 'provide legal', 'provides', 'providing', 'public', 'purchasing', 'qa', 'quality', 'quality assurance', 'quality control', 'quality in', 'quality standards', 'rankings', 'reach', 'recommendations', 'recommendations for', 'recommendations to', 'records', 'recruitment', 'regulations', 'related', 'related to', 'relations', 'relationships', 'relationships with', 'reliability', 'reliable', 'report', 'reporting', 'reports', 'reports and', 'representatives', 'requirements', 'research', 'research and', 'research design', 'research optimize', 'resolve', 'resolve problems', 'resource', 'resources', 'respond', 'respond to', 'responses', 'responsible', 'responsible for', 'responsive', 'retirement', 'revenue', 'risks', 'role', 'safety', 'safety and', 'sales', 'sales and', 'sales representatives', 'sales targets', 'sales teams', 'satisfaction', 'saving', 'saving opportunities', 'schedule', 'schedule content', 'schedules', 'scheduling', 'scope', 'seamless', 'search', 'search engine', 'search engines', 'security', 'security and', 'security measures', 'seo', 'seo specialists', 'seo strategies', 'server', 'server side', 'servers', 'service', 'services', 'services they', 'services to', 'settings', 'side', 'skills', 'skills to', 'smooth', 'social', 'social media', 'software', 'software and', 'software applications', 'solutions', 'solutions and', 'solutions to', 'spaces', 'specialist', 'specialists', 'specialists focus', 'specialists optimize', 'specialize', 'specialize in', 'specializes', 'specializes in', 'specific', 'speech', 'staff', 'stakeholders', 'standards', 'standards they', 'stock', 'storage', 'strategic', 'strategies', 'strategies and', 'strategies they', 'strategies to', 'strategists', 'strategy', 'streamline', 'structures', 'students', 'success', 'such', 'such as', 'supply', 'supply chain', 'support', 'support data', 'support decision', 'support specialists', 'support the', 'support to', 'sustainability', 'sustainable', 'system', 'system responses', 'systems', 'systems and', 'target', 'target audiences', 'targets', 'tasks', 'tax', 'tax laws', 'tax planning', 'team', 'teams', 'teams to', 'techniques', 'technology', 'test', 'testing', 'tests', 'that', 'the', 'the company', 'the organization', 'the organizations', 'the overall', 'the server', 'the visual', 'their', 'their needs', 'them', 'therapist', 'therapy', 'they', 'they analyze', 'they assess', 'they build', 'they collaborate', 'they conduct', 'they configure', 'they create', 'they design', 'they develop', 'they diagnose', 'they ensure', 'they handle', 'they identify', 'they meet', 'they optimize', 'they plan', 'they provide', 'they track', 'they use', 'they work', 'threats', 'through', 'time', 'time and', 'to', 'to achieve', 'to create', 'to customers', 'to drive', 'to end', 'to enhance', 'to ensure', 'to improve', 'to increase', 'to individuals', 'to inform', 'to maintain', 'to maximize', 'to meet', 'to optimize', 'to promote', 'to provide', 'to streamline', 'to support', 'to the', 'tools', 'track', 'track performance', 'traffic', 'traffic and', 'training', 'training programs', 'transactions', 'transportation', 'treat', 'treatment', 'trends', 'trends and', 'troubleshoot', 'troubleshooting', 'troubleshooting issues', 'ui', 'ui ux', 'understand', 'understand their', 'use', 'use their', 'user', 'user behaviors', 'user experience', 'user experiences', 'user friendly', 'user interactions', 'user interface', 'user interfaces', 'user research', 'users', 'using', 'ux', 'valuable', 'valuable insights', 'various', 'vendor', 'vendor relationships', 'visibility', 'visual', 'visual and', 'visually', 'visually appealing', 'water', 'web', 'web applications', 'web pages', 'website', 'websites', 'websites and', 'websites for', 'websites they', 'wedding', 'weddings', 'well', 'well being', 'while', 'will', 'with', 'with clients', 'with cross', 'with development', 'with followers', 'with legal', 'with tax', 'with the', 'within', 'within an', 'within digital', 'work', 'work in', 'work on', 'work to', 'work with', 'working', 'works', 'write', 'you', 'you will', 'your', 'your role']
In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
# Load the datasets
job_data = pd.read_csv('job_descriptions.csv')
candidate_data = pd.read_csv('candidate_profiles.csv')
company_data = pd.read_csv('company_profiles.csv')
# Analyze skill gaps and match candidate profiles to job requirements
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X_job = vectorizer.fit_transform(job_data['Job Description'].dropna())
job_skills = vectorizer.get_feature_names_out()
X_candidate = vectorizer.transform(candidate_data['Skills'].dropna())
candidate_skills = vectorizer.get_feature_names_out()
# Calculate cosine similarity between job requirements and candidate skills
similarity_matrix = cosine_similarity(X_job, X_candidate)
# Identify matching candidates for each job
for i, job_row in job_data.iterrows():
job_title = job_row['Job Title']
job_skills = vectorizer.transform([job_row['Job Description']])
job_candidates = candidate_data.loc[similarity_matrix[i] > 0.7] # Adjust similarity threshold as needed
print(f"Matching candidates for {job_title}: {', '.join(job_candidates['Name'])}")
# Identify emerging skills and responsibilities
# Your code for analyzing job descriptions and identifying emerging skills and responsibilities goes here
# Analyze company profiles
# Your code for analyzing company profiles and understanding company reputation, values, and culture goes here
# Assessing company branding impact
# Your code for analyzing the impact of company branding on candidate attraction and retention goes here
# Identifying alignment between company values and candidate preferences
# Your code for comparing company values with candidate preferences and identifying alignment goes here
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[3], line 8 6 # Load the datasets 7 job_data = pd.read_csv('job_descriptions.csv') ----> 8 candidate_data = pd.read_csv('candidate_profiles.csv') 9 company_data = pd.read_csv('company_profiles.csv') 11 # Analyze skill gaps and match candidate profiles to job requirements File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend) 935 kwds_defaults = _refine_defaults_read( 936 dialect, 937 delimiter, (...) 944 dtype_backend=dtype_backend, 945 ) 946 kwds.update(kwds_defaults) --> 948 return _read(filepath_or_buffer, kwds) File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:611, in _read(filepath_or_buffer, kwds) 608 _validate_names(kwds.get("names", None)) 610 # Create the parser. --> 611 parser = TextFileReader(filepath_or_buffer, **kwds) 613 if chunksize or iterator: 614 return parser File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds) 1445 self.options["has_index_names"] = kwds["has_index_names"] 1447 self.handles: IOHandles | None = None -> 1448 self._engine = self._make_engine(f, self.engine) File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1705, in TextFileReader._make_engine(self, f, engine) 1703 if "b" not in mode: 1704 mode += "b" -> 1705 self.handles = get_handle( 1706 f, 1707 mode, 1708 encoding=self.options.get("encoding", None), 1709 compression=self.options.get("compression", None), 1710 memory_map=self.options.get("memory_map", False), 1711 is_text=is_text, 1712 errors=self.options.get("encoding_errors", "strict"), 1713 storage_options=self.options.get("storage_options", None), 1714 ) 1715 assert self.handles is not None 1716 f = self.handles.handle File ~\anaconda3\Lib\site-packages\pandas\io\common.py:863, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 858 elif isinstance(handle, str): 859 # Check whether the filename is to be opened in binary mode. 860 # Binary mode does not support 'encoding' and 'newline'. 861 if ioargs.encoding and "b" not in ioargs.mode: 862 # Encoding --> 863 handle = open( 864 handle, 865 ioargs.mode, 866 encoding=ioargs.encoding, 867 errors=errors, 868 newline="", 869 ) 870 else: 871 # Binary mode 872 handle = open(handle, ioargs.mode) FileNotFoundError: [Errno 2] No such file or directory: 'candidate_profiles.csv'
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset
data = pd.read_csv('job_descriptions.csv')
# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(data['Job Description'].dropna())
# Generate a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)
# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Job Descriptions')
plt.axis('off')
plt.show()
# Analyze sentiment of job descriptions
sentiments = data['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# Plot the distribution of sentiment scores
plt.figure(figsize=(8, 6))
plt.hist(sentiments, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores in Job Descriptions')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 13 10 all_descriptions = ' '.join(data['Job Description'].dropna()) 12 # Generate a word cloud ---> 13 wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions) 15 # Plot the word cloud 16 plt.figure(figsize=(10, 6)) NameError: name 'WordCloud' is not defined
In [6]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset
data = pd.read_csv('job_descriptions.csv')
# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(data['Job Description'].dropna())
# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Job Descriptions')
plt.axis('off')
plt.show()
# Analyze sentiment of job descriptions
sentiments = data['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# Plot the distribution of sentiment scores
plt.figure(figsize=(8, 6))
plt.hist(sentiments, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores in Job Descriptions')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[6], line 2 1 import pandas as pd ----> 2 from wordcloud import WordCloud 3 import matplotlib.pyplot as plt 4 from textblob import TextBlob ModuleNotFoundError: No module named 'wordcloud'
In [7]:
pip install wordcloud
Collecting wordcloud Downloading wordcloud-1.9.3-cp311-cp311-win_amd64.whl.metadata (3.5 kB) Requirement already satisfied: numpy>=1.6.1 in c:\users\pc\anaconda3\lib\site-packages (from wordcloud) (1.26.4) Requirement already satisfied: pillow in c:\users\pc\anaconda3\lib\site-packages (from wordcloud) (10.2.0) Requirement already satisfied: matplotlib in c:\users\pc\anaconda3\lib\site-packages (from wordcloud) (3.8.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (23.1) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\pc\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0) Downloading wordcloud-1.9.3-cp311-cp311-win_amd64.whl (300 kB) ---------------------------------------- 0.0/300.2 kB ? eta -:--:-- -- ------------------------------------ 20.5/300.2 kB 330.3 kB/s eta 0:00:01 ----- --------------------------------- 41.0/300.2 kB 495.5 kB/s eta 0:00:01 -------------- ----------------------- 112.6/300.2 kB 819.2 kB/s eta 0:00:01 -------------- ----------------------- 112.6/300.2 kB 819.2 kB/s eta 0:00:01 ---------------------------- --------- 225.3/300.2 kB 981.9 kB/s eta 0:00:01 --------------------------------------- 297.0/300.2 kB 1.1 MB/s eta 0:00:01 -------------------------------------- 300.2/300.2 kB 976.2 kB/s eta 0:00:00 Installing collected packages: wordcloud Successfully installed wordcloud-1.9.3 Note: you may need to restart the kernel to use updated packages.
In [10]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
# Load the dataset
data = pd.read_csv('job_descriptions.csv')
# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(data['Job Description'].dropna())
# Generate a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)
# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Job Descriptions')
plt.axis('off')
plt.show()
# Analyze sentiment of job descriptions
sentiments = data['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# Plot the distribution of sentiment scores
plt.figure(figsize=(8, 6))
plt.hist(sentiments, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores in Job Descriptions')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()
In [13]:
data.describe()
Out[13]:
| Job Id | latitude | longitude | Company Size | |
|---|---|---|---|---|
| count | 1.615940e+06 | 1.615940e+06 | 1.615940e+06 | 1.615940e+06 |
| mean | 1.548935e+15 | 1.937743e+01 | 1.639926e+01 | 7.370467e+04 |
| std | 8.946722e+14 | 2.355690e+01 | 7.066762e+01 | 3.529886e+04 |
| min | 1.817948e+11 | -4.090060e+01 | -1.751982e+02 | 1.264600e+04 |
| 25% | 7.740508e+14 | 5.152100e+00 | -1.531010e+01 | 4.311400e+04 |
| 50% | 1.547858e+15 | 1.807080e+01 | 1.914510e+01 | 7.363300e+04 |
| 75% | 2.323729e+15 | 3.907420e+01 | 4.757690e+01 | 1.043000e+05 |
| max | 3.099618e+15 | 7.170690e+01 | 1.780650e+02 | 1.348340e+05 |
In [ ]:
data.columns;
In [23]:
from collections import Counter
import re
from nltk.corpus import stopwords
# Download stopwords
import nltk
nltk.download('stopwords')
# Tokenize and remove stopwords
stop_words = set(stopwords.words('english'))
words = re.findall(r'\w+', all_descriptions.lower())
filtered_words = [word for word in words if word not in stop_words]
# Get the most common words
word_counts = Counter(filtered_words)
common_words = word_counts.most_common(20)
# Plot the most common words
words, counts = zip(*common_words)
plt.figure(figsize=(10, 6))
plt.bar(words, counts, color='skyblue')
plt.title('Most Common Words in Job Descriptions')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\pc\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
In [24]:
skills = ['Python', 'Excel', 'SQL', 'Java', 'C++', 'JavaScript', 'R', 'Machine Learning', 'Deep Learning']
skill_counts = {skill: all_descriptions.lower().count(skill.lower()) for skill in skills}
# Plot skill frequencies
plt.figure(figsize=(10, 6))
plt.bar(skill_counts.keys(), skill_counts.values(), color='skyblue')
plt.title('Skill Frequency in Job Descriptions')
plt.xlabel('Skills')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
In [26]:
# Plot the number of job postings per country
country_counts = data['Country'].value_counts()
plt.figure(figsize=(10, 6))
country_counts.plot(kind='bar', color='skyblue')
plt.title('Job Postings by Country')
plt.xlabel('Country')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
In [27]:
# Plot the distribution of job types
job_type_counts = data['Work Type'].value_counts()
plt.figure(figsize=(10, 6))
job_type_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Job Types')
plt.xlabel('Job Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
In [29]:
import seaborn as sns
# Remove NaN values for plotting
experience_salary_data = data[['Experience', 'Salary Range']].dropna()
plt.figure(figsize=(10, 6))
sns.regplot(x='Experience', y='Salary Range', data=experience_salary_data, scatter_kws={'color': 'skyblue'}, line_kws={'color': 'red'})
plt.title('Experience vs Salary Range')
plt.xlabel('Years of Experience')
plt.ylabel('Salary Range')
plt.show()
In [30]:
# For the heatmap, we'll need a pivot table
country_job_counts = data.pivot_table(index='Country', columns='Job Title', aggfunc='size', fill_value=0)
plt.figure(figsize=(14, 10))
sns.heatmap(country_job_counts, cmap='Blues', annot=True, fmt='d')
plt.title('Heatmap of Job Postings by Country and Job Title')
plt.xlabel('Job Title')
plt.ylabel('Country')
plt.show()
In [31]:
plt.figure(figsize=(10, 6))
sns.violinplot(x='Work Type', y=sentiments, data=data, inner='quartile', palette='pastel')
plt.title('Sentiment Distribution by Job Type')
plt.xlabel('Job Type')
plt.ylabel('Sentiment Score')
plt.show()
In [32]:
data['Job Posting Date'] = pd.to_datetime(data['Job Posting Date'])
data.set_index('Job Posting Date', inplace=True)
# Resample to get job postings per month
monthly_postings = data.resample('M').size()
plt.figure(figsize=(12, 6))
plt.plot(monthly_postings.index, monthly_postings, marker='o', color='skyblue')
plt.title('Job Postings Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Postings')
plt.grid(True)
plt.show()
In [ ]: